ML Analysis of LGG Samples from Liquid Biopsy¶

Author: Shehbeel Arif¶

Preclinical Laboratory Research Unit¶

The Center for Data Driven Discovery in Biomedicine (D3b)¶

Children's Hospital of Philadelphia¶

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import plotly.express as px
In [2]:
meta = pd.read_csv('lgg_lb_meta.csv')
meta = meta.set_index(['SDG_ID'])
#meta
In [3]:
df = pd.read_csv('lb_plasma_matrix.csv')
tdf = df.T
tdf.columns = tdf.iloc[0] 
tdf = tdf[1:]
#tdf
In [4]:
main_df = pd.concat([meta, tdf], axis=1, join="inner")
#main_df

Using Short Histology¶

In [5]:
short_histology_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Tumor_Subtype', 'Relapse', 'Survival_Status'],axis=1)
short_histology_df
Out[5]:
Short_Histology CTRL_ANT1 CTRL_ANT2 CTRL_ANT3 CTRL_ANT4 CTRL_ANT5 CTRL_miR_POS HK_ACTB HK_B2M HK_GAPDH ... miR-944 miR-95-3p miR-95-5p miR-9-5p miR-96-3p miR-96-5p miR-98-3p miR-99a-5p miR-99b-3p miR-99b-5p
15635-37 LGG 16 16 22 25 5 20225 27 109 6245 ... 23 44 31 19 21 125 38 891 75 290
15635-43 HGG 35 60 49 60 47 51017 30 103 12707 ... 44 83 76 37 37 95 96 1876 78 662
15635-45 HGG 122 139 145 95 127 16810 145 1535 3414 ... 79 179 212 179 113 430 111 1141 102 1034
15635-46 LGG 41 60 99 68 49 58350 95 239 29270 ... 11 99 172 67 35 62 118 1972 143 596
15635-53 HGG 6 20 3 27 5 20365 39 302 6065 ... 13 51 27 7 20 316 26 2028 110 540
15635-60 LGG 12 68 43 102 35 87035 58 199 19675 ... 53 98 127 28 104 20 123 1020 174 370
15635-68 LGG 99 86 36 99 60 39440 79 748 15361 ... 61 190 162 98 75 256 67 1842 168 1263
15635-80 LGG 56 87 97 461 47 7490 77 150 17922 ... 68 82 142 65 55 263 37 377 217 960
15635-87 HGG 76 77 97 88 98 44954 45 262 4624 ... 117 201 202 47 65 297 55 3270 248 896
15635-90 LGG 69 52 76 110 68 20213 67 175 5244 ... 40 84 56 81 73 168 46 3492 155 282
15635-100 LGG 0 17 26 20 18 38373 195 3374 17375 ... 0 57 62 24 22 120 72 3586 73 919
15635-101 LGG 39 30 21 33 23 19047 17 86 8125 ... 39 62 92 39 27 126 33 1508 64 254
15635-127 LGG 16 6 19 73 16 24390 33 304 8055 ... 7 28 50 38 10 113 27 865 44 335
15635-134 LGG 28 50 19 64 4 47361 302 1665 12094 ... 60 147 104 40 31 180 75 2621 154 1349
15635-154 HGG 9 30 3 26 11 22451 5 55 2794 ... 8 92 65 15 52 259 29 985 69 467
15635-156 HGG 45 14 36 15 23 37931 75 837 2349 ... 46 220 67 0 0 307 40 8387 128 1737
15635-239 HGG 24 24 77 92 24 23781 105 223 306 ... 44 110 94 49 27 340 53 2902 189 823

17 rows × 2103 columns

In [6]:
# Split the dataset into features and labels
sh_X = short_histology_df.loc[:, short_histology_df.columns != 'Short_Histology'].values
sh_y = short_histology_df.loc[:, short_histology_df.columns == 'Short_Histology'].values.ravel()
In [7]:
# Split data into training and testing set
sh_X_train, sh_X_test, sh_y_train, sh_y_test = train_test_split(sh_X, sh_y, test_size=0.25, random_state=42)

#Sanity check
print(sh_X_train.shape, sh_X_test.shape, sh_y_train.shape, sh_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
In [8]:
# Class Imbalance
fig = px.histogram(short_histology_df, x='Short_Histology')
fig.show()
In [9]:
# Initialize random forest classifier
sh_rf = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
sh_rf.fit(sh_X_train, sh_y_train)

# Make predictions using random forest classifier
sh_rf_y_pred = sh_rf.predict(sh_X_test)
In [10]:
# Accuracy of model
print(f'Accuracy: {accuracy_score(sh_y_test, sh_rf_y_pred)}')
Accuracy: 0.6
In [11]:
# Calculate a confusion matrix
sh_cm = confusion_matrix(sh_y_test, sh_rf_y_pred, labels=sh_rf.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(sh_cm, text_auto=True,
                labels=dict(x="True Subtype", y="Predicted Subtype", color="Productivity"),
                x=short_histology_df['Short_Histology'].unique().tolist(),
                y=short_histology_df['Short_Histology'].unique().tolist()
                )
disp.show()
In [12]:
# What are the most important features?
# rfc2_feature_list = _df.columns
# rfc2_feature_list = rfc2_feature_list.drop('class')

# rfc2_imp_features = pd.Series(rfc2.feature_importances_, index=rfc2_feature_list)

# rfc2_imp_genes = rfc2_imp_features.sort_values(ascending=False).to_frame().reset_index()
# rfc2_imp_genes.columns = ["features", "importance"]

# rfc2_imp_genes_fil = rfc2_imp_genes[~(rfc2_imp_genes == 0.000000).any(axis=1)]
# rfc2_imp_genes_fil
In [13]:
# Display interactive Barplot of important miRNAs
# fig = px.bar(rfc2_imp_genes_fil, x=rfc2_imp_genes_fil.features, y=rfc2_imp_genes_fil.importance)
# fig.show()

Relapse¶

In [14]:
relapse_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Short_Histology', 'Tumor_Subtype', 'Survival_Status'],axis=1)
#relapse_df
In [15]:
# Split the dataset into features and labels
r_X = relapse_df.loc[:, relapse_df.columns != 'Relapse'].values
r_y = relapse_df.loc[:, relapse_df.columns == 'Relapse'].values.ravel()

# Split data into training and testing set
r_X_train, r_X_test, r_y_train, r_y_test = train_test_split(r_X, r_y, test_size=0.25, random_state=42)

#Sanity check
print(r_X_train.shape, r_X_test.shape, r_y_train.shape, r_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
In [16]:
# Class Imbalance
fig = px.histogram(relapse_df, x='Relapse')
fig.show()
In [17]:
# Initialize random forest classifier
r_rf = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
r_rf.fit(r_X_train, r_y_train)

# Make predictions using random forest classifier
r_rf_y_pred = r_rf.predict(r_X_test)
In [18]:
# Accuracy of model
print(f'Accuracy: {accuracy_score(r_y_test, r_rf_y_pred)}')
Accuracy: 0.8
In [19]:
# Calculate a confusion matrix
r_cm = confusion_matrix(r_y_test, r_rf_y_pred, labels=r_rf.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(r_cm, text_auto=True,
                labels=dict(x="True Relapse", y="Predicted Relapse", color="Productivity"),
                x=relapse_df['Relapse'].unique().tolist(),
                y=relapse_df['Relapse'].unique().tolist()
                )
disp.show()